import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Load the Gapminder data and keep only the rows in which both variables of
# interest (CO2 emissions per capita and GDP per capita) are present.
df = pd.read_csv("gapminder_clean.csv")
df = df.dropna(subset=["CO2 emissions (metric tons per capita)", "gdpPercap"])
# Scatter plot of GDP per capita against CO2 emissions, restricted to 1962
is_1962 = df["Year"].eq(1962)
filtered_data = df.loc[is_1962]
sns.scatterplot(
    x="CO2 emissions (metric tons per capita)",
    y="gdpPercap",
    data=filtered_data,
)
<AxesSubplot:xlabel='CO2 emissions (metric tons per capita)', ylabel='gdpPercap'>
# Pearson's r between CO2 emissions and GDP per capita for the 1962 rows
from scipy.stats import pearsonr

co2_1962 = filtered_data["CO2 emissions (metric tons per capita)"]
gdp_1962 = filtered_data["gdpPercap"]
corr, p_value = pearsonr(co2_1962, gdp_1962)
print(
    "\n Pearson correlation of 'CO2 emissions (metric tons per capita)' and gdpPercap, year 1962: \n",
    "Correlation value: ",
    corr,
    "p-value: ",
    p_value,
)
Pearson correlation of 'CO2 emissions (metric tons per capita)' and gdpPercap, year 1962: Correlation value: 0.9260816725019472 p-value: 1.1286792210038754e-46
# Find the year, among all years other than 1962, in which CO2 emissions and
# GDP per capita are most strongly correlated.
unfiltered_data = df[df["Year"] != 1962]
# Correlate only the two columns of interest within each year. Running
# corrwith over the whole frame also tries to correlate the text columns
# (e.g. "Country Name", "continent"), which fails on pandas >= 2.0 where
# numeric_only no longer defaults to dropping them.
yearly_corr = (
    unfiltered_data
    .groupby("Year")[["CO2 emissions (metric tons per capita)", "gdpPercap"]]
    .apply(
        lambda year_rows: year_rows["CO2 emissions (metric tons per capita)"].corr(
            year_rows["gdpPercap"]
        )
    )
    .rename("gdpPercap")
)
# "Strongest" means largest magnitude, so rank by |r| rather than by signed r
# (a strongly negative correlation would otherwise be ranked last).
yearly_corr.loc[[yearly_corr.abs().idxmax()]]
Year 1967 0.938792 Name: gdpPercap, dtype: float64
# Interactive scatter plot for 1967: points are coloured by continent and
# sized by population; the country name is shown on hover.
new_filtered_data = df[df["Year"] == 1967]
import plotly.express as px
fig = px.scatter(
    new_filtered_data,
    x="CO2 emissions (metric tons per capita)",
    y="gdpPercap",
    color="continent",
    size="pop",
    hover_data=["Country Name"],
    # The stray, unbalanced apostrophe after "(metric tons per capita)" has
    # been removed from the displayed title.
    title="CO2 emissions (metric tons per capita) and gdpPercap. Year 1967",
)
fig.show()
# Summary statistics of per-capita energy use for each continent
energy_by_continent = df.groupby("continent")["Energy use (kg of oil equivalent per capita)"]
energy_by_continent.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| continent | ||||||||
| Africa | 198.0 | 700.642721 | 628.227685 | 9.715410 | 377.734680 | 451.382174 | 746.247275 | 3071.774832 |
| Americas | 188.0 | 1703.620453 | 2377.181918 | 219.075497 | 556.033108 | 749.029108 | 1384.585146 | 14608.009868 |
| Asia | 185.0 | 1867.280336 | 2590.043514 | 86.903767 | 345.370792 | 760.140852 | 1987.087308 | 12122.050603 |
| Europe | 239.0 | 3110.604287 | 1768.370162 | 350.101258 | 2045.782889 | 2954.266739 | 3853.373983 | 14746.031338 |
| Oceania | 20.0 | 3980.314420 | 1123.410756 | 1791.461322 | 3143.501420 | 4044.850674 | 4783.650230 | 5868.347097 |
# Horizontal box plot of per-capita energy use, one box per continent
fig = px.box(
    data_frame=df,
    y="continent",
    x="Energy use (kg of oil equivalent per capita)",
)
fig.show()
Before choosing a test, I need to check whether the data satisfy the assumptions of parametric tests (normality and equal variances).
# Non-missing energy-use values, split into one sample per continent.
energy_use = df["Energy use (kg of oil equivalent per capita)"].dropna()
continent_of_row = df.loc[energy_use.index, "continent"]
americas_energy = energy_use[continent_of_row == "Americas"]
oceania_energy = energy_use[continent_of_row == "Oceania"]
africa_energy = energy_use[continent_of_row == "Africa"]
europe_energy = energy_use[continent_of_row == "Europe"]
asia_energy = energy_use[continent_of_row == "Asia"]

import scipy.stats as stats

# Shapiro-Wilk test: H0 = the sample was drawn from a normal distribution.
# Results are printed in the order Americas, Oceania, Africa, Europe, Asia.
shapiro_report = []
for sample in (americas_energy, oceania_energy, africa_energy, europe_energy, asia_energy):
    shapiro_report.extend(["\n", stats.shapiro(sample)])
print(*shapiro_report)
ShapiroResult(statistic=0.5632225871086121, pvalue=1.5868403861741054e-21) ShapiroResult(statistic=0.9818098545074463, pvalue=0.9552662372589111) ShapiroResult(statistic=0.6753993034362793, pvalue=2.724574480963581e-19) ShapiroResult(statistic=0.889901876449585, pvalue=3.4700315537650184e-12) ShapiroResult(statistic=0.6609910130500793, pvalue=4.943039538112358e-19)
Shapiro-Wilk tests: rejected 4/5 null hypotheses (all continents except Oceania). The data are not normally distributed.
import scipy.stats as stats

# Levene's test: H0 = all samples come from populations with equal variances.
stats.levene(
    americas_energy,
    oceania_energy,
    africa_energy,
    europe_energy,
    asia_energy,
    center="median",
    proportiontocut=0.05,
)
LeveneResult(statistic=12.113489871462216, pvalue=1.4250677574810339e-09)
Levene test: rejected the null hypothesis of equal variances.
# Kruskal-Wallis H-test: H0 = the population medians of all groups are equal.
kruskal_result = stats.kruskal(
    americas_energy,
    oceania_energy,
    africa_energy,
    europe_energy,
    asia_energy,
)
# The first element is the H statistic, the second its p-value.
fvalue, pvalue = kruskal_result
print(fvalue, pvalue)
302.0114932359461 3.989307514095183e-64
Kruskal-Wallis H-test: rejected null hypothesis.
# Dunn's test: post hoc pairwise comparison of mean rank sums, suitable as a
# follow-up to a significant Kruskal-Wallis test.
# NOTE(review): no p_adjust argument is passed, so the p-values appear to be
# unadjusted for multiple comparisons - confirm this is intended.
import scikit_posthocs as sp

sample_labels = [
    "americas_energy",
    "oceania_energy",
    "africa_energy",
    "europe_energy",
    "asia_energy",
]
dunn_test = sp.posthoc_dunn(
    [americas_energy, oceania_energy, africa_energy, europe_energy, asia_energy]
)
# Label rows and columns with the sample names, in the order passed above.
dunn_test.index = sample_labels
dunn_test.columns = sample_labels
dunn_test
| americas_energy | oceania_energy | africa_energy | europe_energy | asia_energy | |
|---|---|---|---|---|---|
| americas_energy | 1.000000e+00 | 5.358423e-08 | 6.438289e-10 | 1.025511e-20 | 1.858753e-01 |
| oceania_energy | 5.358423e-08 | 1.000000e+00 | 4.148905e-16 | 1.125350e-01 | 1.779352e-09 |
| africa_energy | 6.438289e-10 | 4.148905e-16 | 1.000000e+00 | 9.844921e-58 | 1.479379e-06 |
| europe_energy | 1.025511e-20 | 1.125350e-01 | 9.844921e-58 | 1.000000e+00 | 1.130215e-26 |
| asia_energy | 1.858753e-01 | 1.779352e-09 | 1.479379e-06 | 1.130215e-26 | 1.000000e+00 |
# Heat map of the pairwise Dunn p-values
fig = px.imshow(img=dunn_test)
fig.show()
Dunn's test finds no significant difference in energy use between Asia and the Americas (p ≈ 0.19) or between Oceania and Europe (p ≈ 0.11); every other pair of continents differs significantly.
# Rows for Europe and Asia after 1990. Import shares of 97% of GDP or more are
# dropped as outliers (original note: "Singapore is an exception").
in_europe_or_asia = df["continent"].isin(["Europe", "Asia"])
after_1990 = df["Year"].gt(1990)
below_outlier_cutoff = df["Imports of goods and services (% of GDP)"].lt(97)
europe_and_asia_after_1990 = df[in_europe_or_asia & after_1990 & below_outlier_cutoff]

# Summary statistics of the import share for each of the two continents
imports_by_continent = europe_and_asia_after_1990.groupby("continent")
imports_by_continent["Imports of goods and services (% of GDP)"].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| continent | ||||||||
| Asia | 93.0 | 41.713928 | 23.454248 | 0.079506 | 25.393531 | 38.831290 | 58.350047 | 96.742045 |
| Europe | 111.0 | 41.761071 | 16.818978 | 17.345130 | 28.462519 | 37.691245 | 51.129504 | 88.512248 |
# Box plot of the import share per continent; country and year shown on hover
fig = px.box(
    data_frame=europe_and_asia_after_1990,
    y="continent",
    x="Imports of goods and services (% of GDP)",
    hover_data=["Country Name", "Year"],
)
fig.show()
# One sample of import shares per continent, missing values removed.
row_continent = europe_and_asia_after_1990["continent"]
europe_imports = europe_and_asia_after_1990.loc[
    row_continent == "Europe", "Imports of goods and services (% of GDP)"
].dropna()
asia_imports = europe_and_asia_after_1990.loc[
    row_continent == "Asia", "Imports of goods and services (% of GDP)"
].dropna()

# Normality check (Shapiro-Wilk) for each sample, Europe first.
europe_shapiro = stats.shapiro(europe_imports)
asia_shapiro = stats.shapiro(asia_imports)
print("\n", europe_shapiro, "\n", asia_shapiro)

# Equal-variance check (Levene) across the two samples.
stats.levene(europe_imports, asia_imports, center="median", proportiontocut=0.05)
ShapiroResult(statistic=0.9278804063796997, pvalue=1.486705423303647e-05) ShapiroResult(statistic=0.9704142212867737, pvalue=0.032892148941755295)
LeveneResult(statistic=9.881593643140368, pvalue=0.0019207769451392669)
Shapiro-Wilk tests: both null hypotheses are rejected at α = 0.05 (only Europe's at α = 0.01), so the data are not normally distributed. Levene test: rejected the null hypothesis of equal variances.
# The Mann-Whitney U test compares two independent groups when the dependent
# variable is ordinal or continuous but not normally distributed.
# The question is whether the groups differ in either direction, so request the
# two-sided test explicitly. Older SciPy releases, when `alternative` was
# omitted, reported a p-value half the size of the two-sided one, which makes
# the result depend on the installed SciPy version.
stats.mannwhitneyu(x=europe_imports, y=asia_imports, alternative="two-sided")
MannwhitneyuResult(statistic=5034.0, pvalue=0.3811649093378452)
Cannot reject the null hypothesis that the import shares of Europe and Asia come from the same distribution.
# One line per country, coloured by continent, showing population density over time
fig = px.line(
    data_frame=df,
    x="Year",
    y="Population density (people per sq. km of land area)",
    line_group="Country Name",
    color="continent",
    hover_name="Country Name",
    title="Population density (people per sq. km of land area) across all years",
)
fig.show()
# Change in life expectancy between each country's first and last record.
# first()/last() return the first/last non-null entry of each column in row
# order, so sort by Year explicitly instead of relying on the order of the file.
grouped_df = df.sort_values("Year", kind="mergesort").groupby("Country Name")
first_values = grouped_df.first().reset_index()
df_1 = first_values[["Country Name", "Life expectancy at birth, total (years)"]]
last_values = grouped_df.last().reset_index()
df_2 = last_values[["Country Name", "Life expectancy at birth, total (years)"]]
# Join on the country key rather than concatenating positionally and
# relabelling the columns afterwards. The previous rename() call discarded its
# result and therefore had no effect.
df_3 = df_1.rename(
    columns={"Life expectancy at birth, total (years)": "first_life"}
).merge(
    df_2.rename(columns={"Life expectancy at birth, total (years)": "last_life"}),
    on="Country Name",
    validate="one_to_one",
)
df_3["delta"] = df_3["last_life"] - df_3["first_life"]
# Five countries with the largest increase
df_3.sort_values(by=["delta"], ascending=False).head(5)[["Country Name", "delta"]]
| Country Name | delta | |
|---|---|---|
| 118 | Tunisia | 30.860756 |
| 81 | Nepal | 30.599634 |
| 24 | China | 29.942098 |
| 88 | Oman | 27.016537 |
| 99 | Saudi Arabia | 26.650561 |
# Keep only countries whose life expectancy increased: delta is also used as
# the marker size, and the original author notes that negative values caused
# an error.
has_positive_delta = df_3["delta"].gt(0)
df_3_greater_than_zero = df_3.loc[has_positive_delta]

axis_labels = {
    "last_life": "Life expectancy at birth, total (years). Last record.",
    "delta": "Difference between first and last record, (years)",
}
fig = px.scatter(
    df_3_greater_than_zero,
    x="delta",
    y="last_life",
    color="delta",
    size="delta",
    hover_data=["Country Name"],
    labels=axis_labels,
    title="Changes in life expectancy at birth",
)
fig.show()